Antonio Coín Castro

Bayesian Functional Logistic Regression

In [1]:
# -- Libraries

# Standard library
import logging
import os
import pickle
import sys
import warnings
from itertools import product
from multiprocessing import Pool

# Third-party
import arviz as az
import numpy as np
import pandas as pd
import scipy
from IPython.display import display
from matplotlib import pyplot as plt
from matplotlib.lines import Line2D

# scikit-learn
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.cross_decomposition import PLSRegression
from sklearn.decomposition import PCA
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import Lasso, LogisticRegression, Ridge
from sklearn.model_selection import (
    GridSearchCV, StratifiedKFold, train_test_split
)
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC, LinearSVC
from sklearn.utils.validation import check_is_fitted

# scikit-fda
import skfda
from skfda.exploratory.depth import IntegratedDepth, ModifiedBandDepth
from skfda.ml.classification import (
    KNeighborsClassifier, MaximumDepthClassifier, NearestCentroid,
)
from skfda.preprocessing.dim_reduction.feature_extraction import FPCA
from skfda.preprocessing.dim_reduction.variable_selection import (
    RKHSVariableSelection as RKVS,
    RecursiveMaximaHunting as RMH,
    MinimumRedundancyMaximumRelevance as mRMR,
)
from skfda.preprocessing.smoothing import BasisSmoother
from skfda.preprocessing.smoothing.kernel_smoothers import (
    NadarayaWatsonSmoother as NW
)
from skfda.preprocessing.smoothing.validation import (
    SmoothingParameterSearch,
    LinearSmootherGeneralizedCVScorer,
    akaike_information_criterion
)
from skfda.representation.basis import BSpline, FDataBasis, Fourier
from skfda.representation.grid import FDataGrid

# Local modules
import utils
from _fpca_basis import FPCABasis
from _fpls import APLS, FPLS, FPLSBasis
from _lda import LDA
In [2]:
# -- Configuration

# Extensions
%load_ext autoreload
%autoreload 2

# Plotting configuration
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
plt.rcParams['figure.figsize'] = [6, 4]
plt.style.use('arviz-darkgrid')
NCOLS = 3


def NROWS(x, ncols=NCOLS):
    return np.ceil(x/ncols).astype('int')


# Randomness and reproducibility
SEED = 42
np.random.seed(SEED)
rng = np.random.default_rng(SEED)

# Floating point precision for display
np.set_printoptions(precision=3, suppress=True)
pd.set_option("display.precision", 3)

# Multiprocessing
N_CORES = 4

# Ignore warnings
np.seterr(over='ignore', divide='ignore')
os.environ["PYTHONWARNINGS"] = 'ignore::UserWarning'

We consider the binary $\{0,1\}$-model

$$ \mathbb P(Y=1\mid X=x) = \frac{1}{1 + \exp\{-\alpha_0-\Psi^{-1}_{x}(\alpha)\}}, $$

i.e., each $Y_i$ can be seen as a Bernoulli random variable $\mathcal B(p(x_i))$ with

$$ p(x_i)\equiv p_i=\mathbb P(Y_i=1\mid X_i=x_i) = \frac{1}{1 + \exp\left\{-\alpha_0-\displaystyle\sum_{j=1}^p \beta_jx_i(\tau_j)\right\}}. $$

Note that $\mathbb E[Y_i]=p_i$ and $\operatorname{Var}(Y_i)=p_i(1-p_i)$.
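
For concreteness, here is a minimal numpy sketch of how these probabilities can be evaluated on a grid (the trajectories x and the parameter values below are toy stand-ins):

import numpy as np

grid = np.linspace(1/100, 1, 100)                        # observation grid
x = np.random.default_rng(0).standard_normal((5, 100))   # toy trajectories
beta, tau, alpha0 = np.array([-5., 10.]), np.array([0.1, 0.8]), -0.5

idx = np.abs(grid - tau[:, None]).argmin(axis=1)  # nearest grid points to each tau_j
lin_comp = alpha0 + x[:, idx]@beta                # alpha0 + sum_j beta_j x_i(tau_j)
p = 1.0/(1.0 + np.exp(-lin_comp))                 # P(Y_i = 1 | X_i = x_i)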

The prior distributions we choose are:

\begin{align*} \pi(\alpha_0, \sigma^2) & \propto 1/\sigma^2, \\ \tau & \sim \mathscr U([0, 1]^p), \\ \beta\mid \tau, \sigma^2 & \sim \mathcal N\left(b_0, g\sigma^2\left[\mathcal X_\tau' \mathcal X_\tau + \eta \lambda_{\text{max}}(\mathcal X_\tau' \mathcal X_\tau) I\right]^{-1}\right). \end{align*}
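
As a sketch, the regularized prior covariance of $\beta$ can be assembled exactly as in the formula above (mirroring what log_prior does later in this notebook; the design matrix and hyperparameter values here are toy stand-ins):

import numpy as np

X_tau = np.random.default_rng(0).standard_normal((100, 2))  # n x p design at the tau points
g, eta, sigma2 = 5.0, 0.1, 1.0                              # toy hyperparameter values

G_tau = X_tau.T@X_tau
G_tau_reg = G_tau + eta*np.max(np.linalg.eigvalsh(G_tau))*np.identity(X_tau.shape[1])
cov_beta = g*sigma2*np.linalg.inv(G_tau_reg)  # covariance of beta | tau, sigma^2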

Note that for computational reasons we will work with $\log \sigma$ instead of $\sigma^2$. Since $d\sigma^2/d(\log\sigma) = 2\sigma^2$, the change of variables turns $\pi(\sigma^2)\propto 1/\sigma^2$ into a flat prior:

$$ \pi(\alpha_0, \log\sigma) \propto 1. $$

Writing the parameter vector as $\theta = (\beta, \tau, \alpha_0, \log \sigma)$, the joint log-posterior density is, up to an additive constant:

$$ \log \pi(\beta, \tau, \alpha_0, \log\sigma\mid Y) = \sum_{i=1}^n \left[ \left(\alpha_0 + \Psi^{-1}_{x_i}(\alpha)\right)y_i - \log\left(1 + \exp\left\{\alpha_0 + \Psi_{x_i}^{-1}(\alpha)\right\}\right)\right] + \frac{1}{2}\log |G_\tau| - p\log \sigma -\frac{1}{2g\sigma^2} (\beta - b_0)'G_\tau(\beta - b_0) + \text{const}. $$

The metrics considered for model evaluation will be:

  • Accuracy.
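
For reference, this metric is simply the fraction of correctly classified test samples; utils.classification_metrics is assumed to compute essentially this:

import numpy as np

Y_test = np.array([0, 1, 1, 0, 1])   # toy labels
Y_hat = np.array([0, 1, 0, 0, 1])    # toy predictions
accuracy = np.mean(Y_test == Y_hat)  # 0.8: fraction of correct predictions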

Example dataset

We generate a toy dataset with $n=100$ functional regressors $X_i(t) \sim GP(0, K(s, t))$, a response variable given by either an $L^2$ model or a "simple" RKHS function, and a value of $\alpha_0=-0.5$. More precisely, we choose one of

$$ Y_i \sim \mathcal B\left(\frac{1}{1 + \exp\left\{0.5 + 5X_i(0.1) - 10X_i(0.8)\right\}}\right) $$

or

$$ Y_i \sim \mathcal B\left(\frac{1}{1 + \exp\left\{0.5 -\int_0^1 \beta(t)X_i(t)\, dt\right\}}\right), $$

where $\beta(t) \in L^2[0, 1]$.
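
A minimal sketch of how the $L^2$ linear predictor can be approximated on the grid (a plain Riemann sum; utils.generate_gp_l2_dataset is assumed to do something along these lines, with its own choice of $\beta$):

import numpy as np

grid = np.linspace(1/100, 1, 100)
x = np.random.default_rng(0).standard_normal((5, 100))  # toy trajectories
beta_t = np.sin(2*np.pi*grid)                           # hypothetical beta(t)
alpha0 = -0.5

lin_comp = alpha0 + (x@beta_t)/len(grid)  # alpha0 + int_0^1 beta(t)x_i(t)dt (Riemann sum)
p = 1.0/(1.0 + np.exp(-lin_comp))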

Another possibility is to generate a dataset as a mixture of two different Gaussian processes, labelling each curve according to the (known) process it was drawn from (a sketch follows).
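
A minimal sketch of this mixture construction (utils.generate_classification_dataset presumably does something similar with the configured kernels; the kernels and means below are toy choices):

import numpy as np

rng = np.random.default_rng(0)
grid = np.linspace(1/100, 1, 100)
s, t = np.meshgrid(grid, grid)
K1 = np.minimum(s, t)                                       # Brownian-motion kernel for class 0
K2 = np.exp(-(s - t)**2/0.2) + 1e-9*np.identity(len(grid))  # squared-exponential kernel (jittered)

n_half = 50
x0 = rng.multivariate_normal(np.zeros(len(grid)), K1, size=n_half)
x1 = rng.multivariate_normal(np.ones(len(grid)), K2, size=n_half)
x = np.vstack([x0, x1])
y = np.repeat([0, 1], n_half)     # label = generating process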

We consider a regular grid of $N=100$ points on $[0, 1]$. In addition, we center the $X_i$ so that they have zero mean when fed to the sampling algorithms.

We also generate a test dataset with $n_{\text{test}}=50$ regressors for model evaluation.

In [3]:
# -- Data generation parameters

SYNTHETIC_DATA = True
MODEL_GEN = "RKHS"  # 'L2', 'RKHS' or 'MIXTURE'
REAL_DATA = "Medflies"
NOISE = 0.10

INITIAL_SMOOTHING = "NW"  # 'NW' or 'Basis'
N_BASIS = 16
STANDARDIZE_PREDICTORS = False

kernel_fn = utils.fractional_brownian_kernel
kernel_fn2 = utils.squared_exponential_kernel
beta_coef = utils.cholaquidis_scenario3

basis = BSpline(n_basis=N_BASIS)
smoothing_params = np.logspace(-4, 4, 50)
In [4]:
# -- Dataset generation

if SYNTHETIC_DATA:
    n_train, n_test = 100, 50
    N = 100
    grid = np.linspace(1./N, 1., N)

    mean_vector = None
    mean_vector2 = np.ones(N)

    beta_true = np.array([-5., 10.])
    tau_true = np.array([0.1, 0.8])
    alpha0_true = -0.5

    if MODEL_GEN == "MIXTURE":
        x, y = utils.generate_classification_dataset(
            grid, kernel_fn, kernel_fn2,
            n_train + n_test, rng,
            mean_vector, mean_vector2)
    else:
        if MODEL_GEN == "L2":
            x, y_lin = utils.generate_gp_l2_dataset(
                grid, kernel_fn,
                n_train + n_test, beta_coef,
                alpha0_true, 0.0, rng=rng
            )
        elif MODEL_GEN == "RKHS":
            x, y_lin = utils.generate_gp_rkhs_dataset(
                grid, kernel_fn,
                n_train + n_test, beta_true, tau_true,
                alpha0_true, 0.0, rng=rng
            )
        else:
            raise ValueError("Invalid model generation strategy.")

        # Transform linear response for logistic model
        y = utils.transform_linear_response(y_lin, noise=NOISE, rng=rng)

    # Train/test split
    X, X_test, Y, Y_test = train_test_split(
        x, y, train_size=n_train, stratify=y,
        random_state=SEED)

    # Create FData object
    X_fd = skfda.FDataGrid(X, grid)
    X_test_fd = skfda.FDataGrid(X_test, grid)

else:
    if REAL_DATA == "Medflies":
        x, y = skfda.datasets.fetch_medflies(return_X_y=True)
    elif REAL_DATA == "Growth":
        x, y = skfda.datasets.fetch_growth(return_X_y=True)
    else:
        raise ValueError("REAL_DATA must be 'Medflies' or 'Growth'.")

    X_fd, X_test_fd, Y, Y_test = train_test_split(
        x, y, train_size=0.8, stratify=y, random_state=SEED)

    N = len(X_fd.grid_points[0])
    grid = np.linspace(1./N, 1., N)  # TODO: use (normalized) real grid
    n_train, n_test = len(X_fd.data_matrix), len(X_test_fd.data_matrix)

if INITIAL_SMOOTHING is not None:
    if INITIAL_SMOOTHING == "NW":
        smoother = NW()
    elif INITIAL_SMOOTHING == "Basis":
        smoother = BasisSmoother(basis)
    else:
        raise ValueError(
            f"Expected 'NW' or 'Basis' but got {INITIAL_SMOOTHING}.")

    best_smoother = SmoothingParameterSearch(
        smoother,
        smoothing_params,
        scoring=LinearSmootherGeneralizedCVScorer(
            akaike_information_criterion),
        n_jobs=-1,
    )

    with utils.IgnoreWarnings():
        best_smoother.fit(X_fd)

    X_fd = best_smoother.transform(X_fd)
    X_test_fd = best_smoother.transform(X_test_fd)
    print(f"Smoother: {best_smoother.best_estimator_.__class__.__name__}")
    print(
        f"Smoothing parameter: {best_smoother.best_params_['smoothing_parameter']:.3f}")

if STANDARDIZE_PREDICTORS:
    X_sd = np.sqrt(X_fd.var())
else:
    X_sd = 1.0

# Standardize data
X_m = X_fd.mean(axis=0)
X_fd = (X_fd - X_m)/X_sd
X = X_fd.data_matrix.reshape(-1, N)
X_test_fd = (X_test_fd - X_m)/X_sd
X_test = X_test_fd.data_matrix.reshape(-1, N)

if SYNTHETIC_DATA:
    n_samples = n_train//2
elif REAL_DATA == "Medflies":
    n_samples = n_train//5
else:
    n_samples = n_train

utils.plot_dataset_classification(X, Y, figsize=(10, 4), n_samples=n_samples)
 /home/antcc/MCD/TFM/bayesian-functional-regression/venv-bfr-py39/lib/python3.9/site-packages/sklearn/model_selection/_search.py:969: UserWarning:One or more of the test scores are non-finite: [   nan    nan    nan    nan    nan    nan    nan -0.001 -0.001 -0.001
 -0.001 -0.001 -0.    -0.    -0.001 -0.001 -0.001 -0.002 -0.003 -0.007
 -0.013 -0.026 -0.048 -0.071 -0.088 -0.098 -0.103 -0.105 -0.106 -0.107
 -0.107 -0.107 -0.107 -0.107 -0.107 -0.107 -0.107 -0.107 -0.107 -0.107
 -0.107 -0.107 -0.107 -0.107 -0.107 -0.107 -0.107 -0.107 -0.107 -0.107]
Smoother: NadarayaWatsonSmoother
Smoothing parameter: 0.013

Common model hyperparameters

In our algorithms, we consider an unconstrained transformed parameter space $\tilde \Theta=\mathbb{R}^{2\hat p+2}$ via the bijections below (a minimal sketch of these maps follows the list).

  • $\tau_j \mapsto \operatorname{logit}(\tau_j)$.
  • $\sigma^2 \mapsto \log\sigma$.
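
A minimal sketch of the two bijections and their inverses (the actual maps live in utils.Logit and theta_space.forward/backward):

import numpy as np
from scipy.special import expit, logit

tau = np.array([0.1, 0.8])
sigma2 = 2.0

tau_tr = logit(tau)              # tau_j -> logit(tau_j), maps (0, 1) onto R
log_sigma = 0.5*np.log(sigma2)   # sigma^2 -> log(sigma)

assert np.allclose(expit(tau_tr), tau)            # inverse of logit is the logistic map
assert np.allclose(np.exp(2*log_sigma), sigma2)   # sigma^2 = exp(2 log(sigma))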
In [5]:
# -- Model hyperparameters

p_hat = 3
g = 5
eta = 0.1

TRANSFORM_TAU = False
FIT_SK = True
In [6]:
# -- Names and labels

# Names of parameters
theta_names = ["β", "τ", "α0", "σ2"]
if TRANSFORM_TAU:
    theta_names_ttr = ["β", "logit τ", "α0", "log σ"]
else:
    theta_names_ttr = ["β", "τ", "α0", "log σ"]
theta_names_aux = ["α0 and log σ"]

# Grouped labels
theta_labels_grouped = [r"$\beta$", r"$\tau$", r"$\alpha_0$", r"$\sigma^2$"]

# Individual labels
theta_labels = []
for i in range(p_hat):
    theta_labels.append(fr"$\beta_{i + 1}$")
for i in range(p_hat):
    theta_labels.append(fr"$\tau_{i + 1}$")
theta_labels.append(theta_labels_grouped[-2])
theta_labels.append(theta_labels_grouped[-1])

# Labels for Arviz
theta_labeller = az.labels.MapLabeller(
    var_name_map=dict(zip(theta_names[-2:], theta_labels_grouped[-2:])),
    coord_map={"projection": dict(
        zip(np.arange(p_hat), np.arange(1, p_hat + 1)))}
)

# Dimension of parameter vector
theta_ndim = len(theta_labels)

# Dimension of grouped parameter vector
theta_ndim_grouped = len(theta_names)

# Names of results columns
results_columns = ["Estimator", "Features", "Accuracy"]
In [7]:
# -- Parameter space and miscellaneous

if TRANSFORM_TAU:
    tau_ttr = utils.Logit()
else:
    tau_ttr = utils.Identity()

# Parameter space
theta_space = utils.ThetaSpace(
    p_hat, grid, theta_names, theta_names_ttr, theta_labels, tau_ttr=tau_ttr)

# Statistics for posterior predictive checks
statistics = [
    ("min", np.min),
    ("max", np.max),
    ("median", np.median),
    ("mean", np.mean),
    ("std", np.std)]

# Point estimates for posterior distribution
point_estimates = ["mode", "mean", "median"]

Sklearn model comparison

In [8]:
# -- Custom CV and transformers

def cv_sk(classifiers, folds, X, Y, X_test, Y_test, verbose=False):
    df_metrics_sk = pd.DataFrame(columns=results_columns)

    for i, (name, pipe, params) in enumerate(classifiers):
        if verbose:
            print(f"Fitting {name}...")
        clf_cv = GridSearchCV(pipe, params, scoring="accuracy",
                              n_jobs=N_CORES, cv=folds)

        with utils.IgnoreWarnings():
            clf_cv.fit(X, Y)

        Y_hat_sk = clf_cv.predict(X_test)
        metrics_sk = utils.classification_metrics(Y_test, Y_hat_sk)

        if name == "sk_fknn":
            n_features = f"K={clf_cv.best_params_['clf__n_neighbors']}"
        elif name == "sk_mdc" or name == "sk_fnc":
            n_features = X.data_matrix.shape[1]
        elif name == "sk_flr":
            n_features = clf_cv.best_estimator_["clf"].p
        elif "pls1" in name:
            n_features = \
                clf_cv.best_estimator_["clf"].base_regressor.n_components
        elif "svm" in name:
            n_features = clf_cv.best_estimator_["clf"].n_features_in_
        else:
            if isinstance(clf_cv.best_estimator_["clf"].coef_[0], FDataBasis):
                coef = clf_cv.best_estimator_["clf"].coef_[0].coefficients[0]
            elif "sk_logistic" in name:
                coef = clf_cv.best_estimator_["clf"].coef_[0]
            else:
                coef = clf_cv.best_estimator_["clf"].coef_

            n_features = sum(~np.isclose(coef, 0))

        df_metrics_sk.loc[i] = [
            name,
            n_features,
            metrics_sk["acc"]]

    # Sort by accuracy once all classifiers have been fitted
    df_metrics_sk.sort_values(
        results_columns[-1], inplace=True, ascending=False)

    return df_metrics_sk, clf_cv


def bayesian_var_sel(idata, theta_space, names,
                     X, Y, X_test, Y_test, folds,
                     prefix, point_est='mode',
                     verbose=False):
    grid = theta_space.grid
    p_hat = theta_space.p
    tau_hat = utils.point_estimate(
        idata, point_est, names)[p_hat:2*p_hat]
    idx_hat = np.abs(grid - tau_hat[:, np.newaxis]).argmin(1)

    classifiers_var_sel = []
    Cs = np.logspace(-4, 4, 20)
    params_clf = {"clf__C": Cs}
    params_svm = {"clf__gamma": ['auto', 'scale']}

    # Bayesian var. sel. (point estimate) + LR
    classifiers_var_sel.append((f"{prefix}_{point_est}+sk_logistic",
                               Pipeline([
                                   ("var_sel", VariableSelection(grid, idx_hat)),
                                   ("data_matrix", DataMatrix()),
                                   ("clf", LogisticRegression(random_state=SEED))]),
                               params_clf
                                ))

    # Bayesian var. sel. (point estimate) + SVM Linear
    classifiers_var_sel.append((f"{prefix}_{point_est}+sk_svm_lin",
                               Pipeline([
                                   ("var_sel", VariableSelection(grid, idx_hat)),
                                   ("data_matrix", DataMatrix()),
                                   ("clf", LinearSVC(random_state=SEED))]),
                               params_clf
                                ))

    # Bayesian var. sel. (point estimate) + SVM RBF
    classifiers_var_sel.append((f"{prefix}_{point_est}+sk_svm_rbf",
                               Pipeline([
                                   ("var_sel", VariableSelection(grid, idx_hat)),
                                   ("data_matrix", DataMatrix()),
                                   ("clf", SVC(kernel='rbf'))]),
                               {**params_svm, **params_clf}
                                ))

    df_metrics_var_sel, _ = cv_sk(classifiers_var_sel, folds,
                                  X, Y, X_test, Y_test, verbose)

    return df_metrics_var_sel


class FeatureSelector(BaseEstimator, TransformerMixin):

    def __init__(self, p=1):
        self.p = p

    def fit(self, X, y=None):
        N = X.shape[1]
        self.idx_ = np.linspace(0, N - 1, self.p).astype(int)
        return self

    def transform(self, X, y=None):
        return X[:, self.idx_]


class DataMatrix(BaseEstimator, TransformerMixin):

    def fit(self, X, y=None):
        self.N = len(X.grid_points[0])
        return self

    def transform(self, X, y=None):
        return X.data_matrix.reshape(-1, self.N)


class Basis(BaseEstimator, TransformerMixin):

    def __init__(self, basis=Fourier()):
        self.basis = basis

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return X.to_basis(self.basis)


class VariableSelection(BaseEstimator, TransformerMixin):

    def __init__(self, grid=None, idx=None):
        self.grid = grid
        self.idx = idx

    def fit(self, X, y=None):
        # Sort in fit() rather than in __init__ (sklearn convention:
        # __init__ should only store the parameters it receives)
        self.idx_sorted_ = np.sort(self.idx)
        return self

    def transform(self, X, y=None):
        return FDataGrid(X.data_matrix[:, self.idx_sorted_],
                         self.grid[self.idx_sorted_])


class PLSRegressionWrapper(PLSRegression):

    def transform(self, X, y=None):
        return super().transform(X)

    def fit_transform(self, X, y):
        return self.fit(X, y).transform(X)

    def predict(self, X, copy=True):
        check_is_fitted(self)

        if self.coef_.shape[1] == 1:  # if n_targets == 1
            return super().predict(X, copy)[:, 0]
        else:
            return super().predict(X, copy)
In [9]:
# -- Select family of classifiers

classifiers = []
Cs = np.logspace(-4, 4, 20)
n_selected = [5, 10, 15, 20, 25, X.shape[1]]
n_components = [2, 3, 4, 5, 10]
n_basis_bsplines = [8, 10, 12, 14, 16]
n_basis_fourier = [3, 5, 7, 9, 11]
basis_bspline = [BSpline(n_basis=p) for p in n_basis_bsplines]
basis_fourier = [Fourier(n_basis=p) for p in n_basis_fourier]
basis_fpls = [FPLSBasis(X_fd, Y, n_basis=p) for p in n_components]

ridge_regressors = [Ridge(alpha=C) for C in Cs]
lasso_regressors = [Lasso(alpha=C) for C in Cs]
pls_regressors = [PLSRegressionWrapper(n_components=p) for p in n_components]
fpls_regressors = [FPLS(n_components=p) for p in n_components]
apls_regressors = [APLS(n_components=p) for p in n_components]
n_neighbors = [3, 5, 7]

params_clf = {"clf__C": Cs}
params_svm = {"clf__gamma": ['auto', 'scale']}
params_select = {"selector__p": n_selected}
params_dim_red = {"dim_red__n_components": n_components}
params_basis = {"basis__basis": basis_bspline + basis_fourier}
params_basis_fpca = {"basis__n_basis": n_components}
params_basis_fpls = {"basis__basis": basis_fpls}
params_var_sel = {"var_sel__n_features_to_select": n_components}
params_knn = {"clf__n_neighbors": n_neighbors,
              "clf__weights": ['uniform', 'distance']}
params_depth = {"clf__depth_method": [ModifiedBandDepth(), IntegratedDepth()]}
params_mrmr = {"var_sel__method": ["MID", "MIQ"]}
params_base_regressors_ridge = {"clf__base_regressor": ridge_regressors}
params_base_regressors_lasso = {"clf__base_regressor": lasso_regressors}
params_base_regressors_pls = {"clf__base_regressor": pls_regressors}
params_base_regressors_fpls = {"clf__base_regressor": fpls_regressors}
params_base_regressors_apls = {"clf__base_regressor": apls_regressors}

"""
MULTIVARIATE MODELS
"""

# LDA (based on FPCA+Ridge regression)
classifiers.append(("sk_lda_fpca+ridge",
                   Pipeline([
                       ("dim_red", FPCA()),
                       ("clf", LDA())]),
                   {**params_dim_red, **params_base_regressors_ridge}
                    ))

"""
TAKES TOO LONG (CV search space too large?)

# LDA (based on FPLS (fixed basis)+Ridge regression)
classifiers.append(("sk_lda_fpls_basis+ridge",
                   Pipeline([
                       ("basis", Basis()),
                       ("dim_red", FPLS()),
                       ("clf", LDA())]),
                   {**params_basis, 
                    **params_dim_red, 
                    **params_base_regressors_ridge}
                    ))
"""

# LDA (based on Lasso regression)
classifiers.append(("sk_lda_lasso",
                   Pipeline([
                       ("data_matrix", DataMatrix()),
                       ("clf", LDA())]),
                   params_base_regressors_lasso
                    ))

# LDA (based on PLS1 regression)
classifiers.append(("sk_lda_pls1",
                   Pipeline([
                       ("data_matrix", DataMatrix()),
                       ("clf", LDA())]),
                   params_base_regressors_pls
                    ))

# LDA (based on Manual+Ridge regression)
classifiers.append(("sk_lda_manual+ridge",
                   Pipeline([
                       ("data_matrix", DataMatrix()),
                       ("selector", FeatureSelector()),
                       ("clf", LDA())]),
                   {**params_select, **params_base_regressors_ridge}
                    ))

# LDA (based on PCA+Ridge regression)
classifiers.append(("sk_lda_pca+ridge",
                   Pipeline([
                       ("data_matrix", DataMatrix()),
                       ("dim_red", PCA(random_state=SEED)),
                       ("clf", LDA())]),
                   {**params_dim_red, **params_base_regressors_ridge}
                    ))

# LDA (based on PLS+Ridge regression)
classifiers.append(("sk_lda_pls+ridge",
                   Pipeline([
                       ("data_matrix", DataMatrix()),
                       ("dim_red", PLSRegressionWrapper()),
                       ("clf", LDA())]),
                   {**params_dim_red, **params_base_regressors_ridge}
                    ))

# LDA (based on RMH+Ridge regression)
classifiers.append(("sk_lda_rmh+ridge",
                   Pipeline([
                       ("var_sel", RMH()),
                       ("clf", LDA())]),
                   params_base_regressors_ridge
                    ))

"""
TAKES TOO LONG (CV search space too large?)

# LDA (based on mRMR+Ridge regression)
classifiers.append(("sk_lda_mRMR+ridge",
                   Pipeline([
                       ("data_matrix", DataMatrix()),
                       ("var_sel", mRMR()),
                       ("clf", LDA())]),
                   {**params_var_sel, 
                    **params_mrmr, 
                    **params_base_regressors_ridge}
                   ))
"""

"""
VARIABLE SELECTION + MULTIVARIATE MODELS
"""

# Manual+LR
classifiers.append(("manual_sel+sk_logistic",
                   Pipeline([
                       ("data_matrix", DataMatrix()),
                       ("selector", FeatureSelector()),
                       ("clf", LogisticRegression(random_state=SEED))]),
                   {**params_clf, **params_select}
                    ))

# FPCA+LR
classifiers.append(("fpca+sk_logistic",
                   Pipeline([
                       ("dim_red", FPCA()),  # Retains scores only
                       ("clf", LogisticRegression(random_state=SEED))]),
                   {**params_dim_red, **params_clf}
                    ))

# PCA+LR
classifiers.append(("pca+sk_logistic",
                   Pipeline([
                       ("data_matrix", DataMatrix()),
                       ("dim_red", PCA(random_state=SEED)),
                       ("clf", LogisticRegression(random_state=SEED))]),
                   {**params_dim_red, **params_clf}
                    ))

"""
TAKES TOO LONG (CV search space too large?)

# FPLS (fixed basis)+LR
classifiers.append(("fpls_basis+sk_logistic",
                   Pipeline([
                       ("basis", Basis()),
                       ("dim_red", FPLS()),
                       ("clf", LogisticRegression(random_state=SEED))]),
                   {**params_basis, **params_dim_red, **params_clf}
                    ))
"""

# PLS+LR
classifiers.append(("pls+sk_logistic",
                   Pipeline([
                       ("data_matrix", DataMatrix()),
                       ("dim_red", PLSRegressionWrapper()),
                       ("clf", LogisticRegression(random_state=SEED))]),
                   {**params_dim_red, **params_clf}
                    ))

# RKVS+LR
classifiers.append(("rkvs+sk_logistic",
                   Pipeline([
                       ("var_sel", RKVS()),
                       ("clf", LogisticRegression(random_state=SEED))]),
                   params_var_sel
                    ))

# RMH+LR
classifiers.append(("rmh+sk_logistic",
                   Pipeline([
                       ("var_sel", RMH()),
                       ("clf", LogisticRegression(random_state=SEED))]),
                   {}
                    ))

"""
TAKES TOO LONG (CV search space too large?)

# mRMR+LR
classifiers.append(("mRMR+sk_logistic",
                   Pipeline([
                       ("var_sel", mRMR()),
                       ("clf", LogisticRegression(random_state=SEED))]),
                   {**params_var_sel, **params_mrmr}
                    ))
"""

# Manual+SVM Linear
classifiers.append(("manual_sel+sk_svm_lin",
                   Pipeline([
                       ("data_matrix", DataMatrix()),
                       ("selector", FeatureSelector()),
                       ("clf", LinearSVC(random_state=SEED))]),
                   {**params_select, **params_clf}
                    ))

# FPCA+SVM Linear
classifiers.append(("fpca+sk_svm_lin",
                   Pipeline([
                       ("dim_red", FPCA()),  # Retains scores only
                       ("clf", LinearSVC(random_state=SEED))]),
                   {**params_dim_red, **params_clf}
                    ))

# PCA+SVM Linear
classifiers.append(("pca+sk_svm_lin",
                   Pipeline([
                       ("data_matrix", DataMatrix()),
                       ("dim_red", PCA(random_state=SEED)),
                       ("clf", LinearSVC(random_state=SEED))]),
                   {**params_dim_red, **params_clf}
                    ))
"""
TAKES TOO LONG (CV search space too large?)

# FPLS (fixed basis)+SVM Linear
classifiers.append(("fpls_basis+sk_svm_lin",
                   Pipeline([
                       ("basis", Basis()),
                       ("dim_red", FPLS()),
                       ("clf", LinearSVC(random_state=SEED))]),
                   {**params_basis, **params_dim_red, **params_clf}
                    ))
"""

# PLS+SVM Linear
classifiers.append(("pls+sk_svm_lin",
                   Pipeline([
                       ("data_matrix", DataMatrix()),
                       ("dim_red", PLSRegressionWrapper()),
                       ("clf", LinearSVC(random_state=SEED))]),
                   {**params_dim_red, **params_clf}
                    ))

# RKVS+SVM Linear
classifiers.append(("rkvs+sk_svm_lin",
                   Pipeline([
                       ("var_sel", RKVS()),
                       ("clf", LinearSVC(random_state=SEED))]),
                   {**params_var_sel, **params_clf}
                    ))

# RMH+SVM Linear
classifiers.append(("rmh+sk_svm_lin",
                   Pipeline([
                       ("var_sel", RMH()),
                       ("clf", LinearSVC(random_state=SEED))]),
                   params_clf
                    ))

"""
TAKES TOO LONG (CV search space too large?)

# mRMR+SVM Linear
classifiers.append(("mRMR+sk_svm_lin",
                   Pipeline([
                       ("var_sel", mRMR()),
                       ("clf", LinearSVC(random_state=SEED))]),
                   {**params_var_sel, **params_mrmr, **params_clf}
                    ))
"""

# Manual+SVM RBF
classifiers.append(("manual_sel+sk_svm_rbf",
                   Pipeline([
                       ("data_matrix", DataMatrix()),
                       ("selector", FeatureSelector()),
                       ("clf", SVC(kernel='rbf'))]),
                   {**params_select, **params_clf, **params_svm}
                    ))

# FPCA+SVM RBF
classifiers.append(("fpca+sk_svm_rbf",
                   Pipeline([
                       ("dim_red", FPCA()),  # Retains scores only
                       ("clf", SVC(kernel='rbf'))]),
                   {**params_dim_red, **params_clf, **params_svm}
                    ))

# PCA+SVM RBF
classifiers.append(("pca+sk_svm_rbf",
                   Pipeline([
                       ("data_matrix", DataMatrix()),
                       ("dim_red", PCA(random_state=SEED)),
                       ("clf", SVC(kernel='rbf'))]),
                   {**params_dim_red, **params_clf, **params_svm}
                    ))

"""
TAKES TOO LONG (CV search space too large?)

# FPLS (fixed basis)+SVM RBF
classifiers.append(("fpls_basis+sk_svm_rbf",
                   Pipeline([
                       ("basis", Basis()),
                       ("dim_red", FPLS()),
                       ("clf", SVC(kernel='rbf'))]),
                   {**params_basis, 
                    **params_dim_red, 
                    **params_clf, 
                    **params_svm}
                    ))
"""

# PLS+SVM RBF
classifiers.append(("pls+sk_svm_rbf",
                   Pipeline([
                       ("data_matrix", DataMatrix()),
                       ("dim_red", PLSRegressionWrapper()),
                       ("clf", SVC(kernel='rbf'))]),
                   {**params_dim_red, **params_clf, **params_svm}
                    ))

# RKVS+SVM RBF
classifiers.append(("rkvs+sk_svm_rbf",
                   Pipeline([
                       ("var_sel", RKVS()),
                       ("clf", SVC(kernel='rbf'))]),
                   {**params_var_sel, **params_clf, **params_svm}
                    ))

# RMH+SVM RBF
classifiers.append(("rmh+sk_svm_rbf",
                   Pipeline([
                       ("var_sel", RMH()),
                       ("clf", SVC(kernel='rbf'))]),
                   {**params_clf, **params_svm}
                    ))

"""
TAKES TOO LONG (CV search space too large?)

# mRMR+SVM RBF
classifiers.append(("mRMR+sk_svm_rbf",
                   Pipeline([
                       ("var_sel", mRMR()),
                       ("clf", SVC(kernel='rbf'))]),
                   {**params_var_sel, 
                    **params_mrmr, 
                    **params_clf, 
                    **params_svm}
                    ))
"""

"""
FUNCTIONAL MODELS
"""

"""
TAKES QUITE A WHILE

# Functional Logistic Regression Model (testing)
from _logistic_regression_TEMP import LogisticRegression as FLogisticRegression
params_flr = {"clf__p": n_components}

classifiers.append(("sk_flr",
                   Pipeline([
                       ("clf", FLogisticRegression())]),
                   params_flr
                    ))
"""

# Maximum Depth Classifier
classifiers.append(("sk_mdc",
                   Pipeline([
                       ("clf", MaximumDepthClassifier())]),
                   params_depth
                    ))

# KNeighbors Functional Classification
classifiers.append(("sk_fknn",
                   Pipeline([
                       ("clf", KNeighborsClassifier())]),
                   params_knn
                    ))

# Nearest Centroid Functional Classification
classifiers.append(("sk_fnc",
                   Pipeline([
                       ("clf", NearestCentroid())]),
                   {}
                    ))

# NOTE: while not strictly necessary, the test data undergoes the
# same basis expansion process as the training data. This is more
# computationally efficient and seems to improve the performance.

# Functional LDA (based on L^2-regression with fixed basis)
classifiers.append(("sk_flda_l2_basis",
                   Pipeline([
                       ("basis", Basis()),
                       ("clf", LDA())]),
                   params_basis
                    ))

"""
TAKES QUITE A WHILE (computing the Gram matrix in this basis is expensive)

# Functional LDA (based on L^2-regression with FPCA basis)
classifiers.append(("sk_flda_l2_khl",
                   Pipeline([
                       ("basis", FPCABasis()),
                       ("clf", LDA())]),
                   params_basis_fpca
                    ))
"""

"""
TAKES QUITE A WHILE (computing the Gram matrix in this basis is expensive)

# Functional LDA (based on L^2-regression with FPLS basis)
classifiers.append(("sk_flda_l2_fpls",
                   Pipeline([
                       ("basis", Basis()),
                       ("clf", LDA())]),
                   params_basis_fpls
                    ))
"""

# Functional LDA (based on FPLS1 regression with fixed basis)
classifiers.append(("sk_flda_fpls1_basis",
                   Pipeline([
                       ("basis", Basis()),
                       ("clf", LDA())]),
                   {**params_basis, **params_base_regressors_fpls}
                    ))

# Functional LDA (based on APLS regression)
classifiers.append(("sk_flda_apls",
                   Pipeline([
                       ("clf", LDA())]),
                   params_base_regressors_apls
                    ))
In [10]:
# -- Fit models and show metrics

folds = StratifiedKFold(shuffle=True, random_state=SEED)

if FIT_SK:
    df_metrics_sk, clf_cv = cv_sk(classifiers, folds, X_fd, Y,
                                  X_test_fd, Y_test, verbose=True)
    display(df_metrics_sk.style.hide_index())
Fitting sk_lda_fpca+ridge...
Fitting sk_lda_lasso...
Fitting sk_lda_pls1...
Fitting sk_lda_manual+ridge...
Fitting sk_lda_pca+ridge...
Fitting sk_lda_pls+ridge...
Fitting sk_lda_rmh+ridge...
Fitting manual_sel+sk_logistic...
Fitting fpca+sk_logistic...
Fitting pca+sk_logistic...
Fitting pls+sk_logistic...
Fitting rkvs+sk_logistic...
Fitting rmh+sk_logistic...
Fitting manual_sel+sk_svm_lin...
Fitting fpca+sk_svm_lin...
Fitting pca+sk_svm_lin...
 /home/antcc/MCD/TFM/bayesian-functional-regression/venv-bfr-py39/lib/python3.9/site-packages/sklearn/svm/_base.py:1206: ConvergenceWarning:Liblinear failed to converge, increase the number of iterations.
Fitting pls+sk_svm_lin...
Fitting rkvs+sk_svm_lin...
Fitting rmh+sk_svm_lin...
Fitting manual_sel+sk_svm_rbf...
Fitting fpca+sk_svm_rbf...
Fitting pca+sk_svm_rbf...
Fitting pls+sk_svm_rbf...
Fitting rkvs+sk_svm_rbf...
Fitting rmh+sk_svm_rbf...
Fitting sk_mdc...
Fitting sk_fknn...
Fitting sk_fnc...
Fitting sk_flda_l2_basis...
Fitting sk_flda_fpls1_basis...
 /home/antcc/MCD/TFM/bayesian-functional-regression/venv-bfr-py39/lib/python3.9/site-packages/sklearn/model_selection/_validation.py:372: FitFailedWarning:
35 fits failed out of a total of 250.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
7 groups of 5 fits each failed with the same traceback, differing only in the
component/basis counts reported (10 > 8, 4 > 3, 5 > 3, 10 > 3, 10 > 5, 10 > 7
and 10 > 9):
Traceback (most recent call last):
  File "/home/antcc/MCD/TFM/bayesian-functional-regression/venv-bfr-py39/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 681, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/antcc/MCD/TFM/bayesian-functional-regression/venv-bfr-py39/lib/python3.9/site-packages/sklearn/pipeline.py", line 394, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/home/antcc/MCD/TFM/bayesian-functional-regression/examples/_lda.py", line 96, in fit
    self.base_regressor.fit(X, y_new, **kwargs)
  File "/home/antcc/MCD/TFM/bayesian-functional-regression/examples/_fpls.py", line 209, in fit
    raise AttributeError(
AttributeError: n_components cannot be higher that the number of basis elements (10 > 8).
 /home/antcc/MCD/TFM/bayesian-functional-regression/venv-bfr-py39/lib/python3.9/site-packages/sklearn/model_selection/_search.py:969: UserWarning:One or more of the test scores are non-finite: [0.77 0.74 0.75 0.71  nan 0.79 0.75 0.77 0.71 0.72 0.79 0.75 0.77 0.71
 0.71 0.79 0.75 0.78 0.7  0.7  0.79 0.74 0.77 0.69 0.68 0.76 0.74  nan
  nan  nan 0.77 0.75 0.74 0.73  nan 0.76 0.74 0.79 0.77  nan 0.76 0.74
 0.73 0.73  nan 0.78 0.69 0.73 0.73 0.7 ]
Fitting sk_flda_apls...
Estimator Features Accuracy
rmh+sk_svm_rbf 2 0.720
fpca+sk_logistic 2 0.700
fpca+sk_svm_lin 2 0.700
pca+sk_svm_rbf 3 0.700
fpca+sk_svm_rbf 10 0.700
pca+sk_logistic 2 0.680
sk_fknn K=5 0.680
manual_sel+sk_logistic 100 0.680
rmh+sk_svm_lin 2 0.680
sk_flda_apls 2 0.680
rkvs+sk_svm_rbf 3 0.680
pca+sk_svm_lin 2 0.680
manual_sel+sk_svm_rbf 5 0.680
sk_fnc 100 0.680
sk_lda_pca+ridge 2 0.660
rmh+sk_logistic 2 0.660
manual_sel+sk_svm_lin 100 0.660
rkvs+sk_svm_lin 4 0.660
sk_lda_fpca+ridge 2 0.660
sk_lda_rmh+ridge 2 0.660
pls+sk_svm_rbf 2 0.660
rkvs+sk_logistic 3 0.660
pls+sk_logistic 2 0.660
sk_lda_lasso 2 0.640
sk_lda_manual+ridge 10 0.640
pls+sk_svm_lin 2 0.620
sk_flda_l2_basis 7 0.600
sk_flda_fpls1_basis 2 0.600
sk_lda_pls1 2 0.580
sk_lda_pls+ridge 2 0.580
sk_mdc 100 0.580

Maximum Likelihood Estimator

In [11]:
# -- Negative log-likelihood definition in transformed parameter space

def neg_ll(theta_tr, X, Y, theta_space):
    """Transformed parameter vector 'theta_tr' is (β, τ, α0, log σ)."""
    n, N = X.shape
    grid = theta_space.grid

    assert len(theta_tr) == theta_space.ndim

    # With TRANSFORM_TAU = False, β, τ and α0 coincide in both parameter
    # spaces, and σ does not enter the likelihood, so no back-transform is needed
    beta, tau, alpha0, _ = theta_space.get_params(theta_tr)

    idx = np.abs(grid - tau[:, np.newaxis]).argmin(1)
    X_tau = X[:, idx]
    lin_comp = alpha0 + X_tau@beta

    return -np.sum(lin_comp*Y - np.logaddexp(0, lin_comp))
In [12]:
# -- MLE estimation functions

def optimizer_global(random_state, args):
    theta_init, X, Y, theta_space, method, bounds = args
    return scipy.optimize.basinhopping(
        neg_ll,
        x0=theta_init,
        seed=random_state,
        minimizer_kwargs={"args": (X, Y, theta_space),
                          "method": method,
                          "bounds": bounds}
    ).x


def compute_mle(
        theta_space, X, Y,
        method='Powell',
        strategy='global',
        rng=None):
    if rng is None:
        rng = np.random.default_rng()

    p = theta_space.p

    theta_init = theta_space.forward(
        np.array([0.0]*p + [0.5]*p + [0.0] + [1.0]))

    if TRANSFORM_TAU:
        bounds = None
    else:
        bounds = ([(None, None)]*p
                  + [(theta_space.tau_lb, theta_space.tau_ub)]*p
                  + [(None, None)]
                  + [(None, None)])

    with utils.IgnoreWarnings():
        if strategy == 'local':
            mle_theta = scipy.optimize.minimize(
                neg_ll,
                x0=theta_init,
                bounds=bounds,
                method=method,
                args=(X, Y, theta_space)
            ).x
            bic = utils.compute_bic(theta_space, neg_ll, mle_theta, X, Y)
        elif strategy == 'global':
            with Pool(N_CORES) as pool:
                print(f"-- Computing MLE with {N_CORES} independent runs --")
                random_states = [rng.integers(2**32) for _ in range(N_CORES)]
                args_optim = [theta_init, X, Y, theta_space, method, bounds]
                mles = pool.starmap(
                    optimizer_global, product(random_states, [args_optim]))
                bics = utils.compute_bic(theta_space, neg_ll, mles, X, Y)
                best = np.argmin(bics)
                mle_theta = mles[best]
                bic = bics[best]
        else:
            raise ValueError(
                "Parameter 'strategy' must be one of {'local', 'global'}.")

    return mle_theta, bic
In [13]:
# -- MLE estimation

method_mle = 'L-BFGS-B'  # 'Nelder-Mead', 'Powell' or 'L-BFGS-B'
strategy_mle = 'global'


mle_theta_tr, bic = compute_mle(
    theta_space, X, Y, method_mle,
    strategy_mle, rng)
mle_theta = theta_space.backward(mle_theta_tr)
Y_hat_mle = utils.generate_response_logistic(X_test, mle_theta, prob=False)
df_metrics_mle = pd.DataFrame(columns=results_columns)
metrics_mle = utils.classification_metrics(Y_test, Y_hat_mle)
df_metrics_mle.loc[0] = [
    "mle",
    p_hat,
    metrics_mle["acc"]
]

print(f"\nBIC: {bic:.3f}")
print("MLE:")
display(pd.DataFrame(zip(theta_space.labels, mle_theta),
                     columns=["", "MLE"]).style.hide_index())
print("Classification metrics:")
df_metrics_mle.style.hide_index()
-- Computing MLE with 4 independent runs --

BIC: 135.052
MLE:
MLE
$\beta_1$ -22.780
$\beta_2$ 9.061
$\beta_3$ 1.707
$\tau_1$ 0.027
$\tau_2$ 0.085
$\tau_3$ 0.952
$\alpha_0$ -0.099
$\sigma^2$ 2.194
Classification metrics:
Out[13]:
Estimator Features Accuracy
mle 3 0.660

The Ensemble Sampler and the emcee library

In [14]:
import emcee

Model

We only need to provide the sampler with the logarithm of the posterior distribution. For clarity we split its computation into log-prior and log-likelihood, although a more efficient implementation would compute everything in a single function.

In [15]:
# -- Log-posterior model

def log_prior(theta_tr):
    """Global parameters (for efficient parallelization): 
        X, b0, g, eta, theta_space"""
    assert len(theta_tr) == theta_space.ndim

    n, N = X.shape
    p = theta_space.p
    grid = theta_space.grid

    theta = theta_space.backward(theta_tr)
    beta, tau, alpha0, sigma2 = theta_space.get_params(theta)
    log_sigma = theta_space.get_sigma2(theta_tr)

    if not TRANSFORM_TAU:
        if (tau < theta_space.tau_lb).any() or (tau > theta_space.tau_ub).any():
            return -np.inf

    # Transform variables
    b = beta - b0

    # Compute and regularize G_tau
    idx = np.abs(grid - tau[:, np.newaxis]).argmin(1)
    X_tau = X[:, idx]
    G_tau = X_tau.T@X_tau
    G_tau = (G_tau + G_tau.T)/2.  # Enforce symmetry
    G_tau_reg = G_tau + eta * \
        np.max(np.linalg.eigvalsh(G_tau))*np.identity(p)

    # Compute log-prior
    log_prior = (0.5*utils.logdet(G_tau_reg)
                 - p*log_sigma
                 - b.T@G_tau_reg@b/(2*g*sigma2))

    return log_prior


def log_likelihood(theta_tr, Y):
    """Global parameters (for efficient parallelization): 
        X, theta_space, return_ll"""
    n, N = X.shape
    grid = theta_space.grid

    assert len(theta_tr) == theta_space.ndim

    # As in neg_ll, no back-transform is needed here when TRANSFORM_TAU = False
    beta, tau, alpha0, _ = theta_space.get_params(theta_tr)

    idx = np.abs(grid - tau[:, np.newaxis]).argmin(1)
    X_tau = X[:, idx]
    lin_comp = alpha0 + X_tau@beta
    ll_pointwise = lin_comp*Y - np.logaddexp(0, lin_comp)
    ll = np.sum(ll_pointwise)

    if return_ll:
        return ll, ll_pointwise
    else:
        return ll


def log_posterior(theta_tr, Y):
    """Global parameters (for efficient parallelization): 
        X, rng, return_pp, return_ll, theta_space"""
    # Compute log-prior
    lp = log_prior(theta_tr)

    if not np.isfinite(lp):
        if return_pp and return_ll:
            return (-np.inf, np.full_like(Y, -1.0),
                    np.full_like(Y, -1), np.full_like(Y, -np.inf))
        elif return_pp:
            return -np.inf, np.full_like(Y, -1.0), np.full_like(Y, -1)
        elif return_ll:
            return -np.inf, np.full_like(Y, -np.inf)
        else:
            return -np.inf

    # Compute log-likelihood (and possibly pointwise log-likelihood)
    if return_ll:
        ll, ll_pointwise = log_likelihood(theta_tr, Y)
    else:
        ll = log_likelihood(theta_tr, Y)

    # Compute log-posterior
    lpos = lp + ll

    # Compute posterior predictive samples
    if return_pp:
        theta = theta_space.backward(theta_tr)
        pp_y, pp_p = utils.generate_response_logistic(
            X, theta, return_p=True, rng=rng)

    # Return information
    if return_pp and return_ll:
        return lpos, pp_p, pp_y, ll_pointwise
    elif return_pp:
        return lpos, pp_p, pp_y
    elif return_ll:
        return lpos, ll_pointwise
    else:
        return lpos

Experiments

We set the initial points of the chains to lie in a random neighbourhood around the MLE, to speed up convergence (a minimal illustration of this kind of initialization follows).
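
A sketch of such an initialization (the actual logic lives in utils.weighted_initial_guess_around_value; the jitter scale and the fully random fraction below are hypothetical):

import numpy as np

rng = np.random.default_rng(0)
n_walkers, ndim = 64, 8
mle_theta_tr = np.zeros(ndim)   # stand-in for the transformed MLE
sd = 0.1                        # hypothetical jitter scale

# Most walkers: Gaussian jitter around the MLE
p0 = mle_theta_tr + sd*rng.standard_normal((n_walkers, ndim))

# A fraction of walkers start at fully random points to aid exploration
n_random = int(0.3*n_walkers)
p0[:n_random] = rng.normal(scale=1.0, size=(n_random, ndim))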

In [16]:
def run_emcee(n_walkers, n_iter_initial, n_iter, moves,
              thin, thin_pp, return_pp, return_ll):
    # -- Run sampler

    with Pool(N_CORES) as pool:
        print(
            f"-- Running affine-invariant ensemble sampler with {N_CORES} cores --")

        sampler = emcee.EnsembleSampler(
            n_walkers, theta_ndim, log_posterior,
            pool=pool, args=(Y,),
            moves=moves)

        print("Tuning phase...")
        state = sampler.run_mcmc(
            p0, n_iter_initial, progress='notebook',
            store=False)
        sampler.reset()

        print("MCMC sampling...")
        sampler.run_mcmc(state, n_iter, progress='notebook')

    print(
        f"Mean acceptance fraction: {100*np.mean(sampler.acceptance_fraction):.3f}%")

    logging.disable(sys.maxsize)  # Disable logger

    # Analyze autocorrelation and set burn-in and thinning values
    autocorr = sampler.get_autocorr_time(quiet=True)
    max_autocorr = np.max(autocorr)
    if (np.isfinite(autocorr)).all():
        burn = int(3*max_autocorr)
    else:
        print("Some autocorrelation value is not finite")
        burn = 500

    logging.disable(logging.NOTSET)  # Re-enable logger

    # Get InferenceData object
    idata_emcee = utils.emcee_to_idata(
        sampler, theta_space, burn, thin,
        ["p_star", "y_star"] if return_pp else [],
        return_ll)

    print("\n-- Summary statistics --")
    display(utils.summary(idata_emcee, var_names=theta_names,
                          kind="stats", labeller=theta_labeller))

    # -- Compute metrics using several point estimates

    df_metrics_emcee = pd.DataFrame(columns=results_columns)

    # Posterior mean estimate
    pp_test_p, pp_test_y = utils.generate_pp(
        idata_emcee, X_test, theta_names,
        thin_pp, rng=rng,
        kind='classification')

    print("Computing metrics...", end="\r")

    Y_hat_pp_mean = [utils.threshold(y) for y in pp_test_p.mean(axis=(0, 1))]
    Y_hat_pp_vote = [utils.threshold(y) for y in pp_test_y.mean(axis=(0, 1))]
    metrics_pp_mean = utils.classification_metrics(Y_test, Y_hat_pp_mean)
    metrics_pp_vote = utils.classification_metrics(Y_test, Y_hat_pp_vote)
    df_metrics_emcee.loc[0] = [
        "emcee_posterior_mean",
        p_hat,
        metrics_pp_mean["acc"]
    ]
    df_metrics_emcee.loc[1] = [
        "emcee_posterior_vote",
        p_hat,
        metrics_pp_vote["acc"]
    ]

    # Point estimates
    for i, pe in enumerate(point_estimates):
        Y_hat_pe = utils.point_predict(
            X_test, idata_emcee,
            theta_names, pe,
            kind='classification')
        metrics_pe = utils.classification_metrics(Y_test, Y_hat_pe)
        df_metrics_emcee.loc[i + 2] = [
            "emcee_" + pe,
            p_hat,
            metrics_pe["acc"]
        ]

    # Bayesian variable selection
    for pe in point_estimates:
        df_metrics_var_sel = bayesian_var_sel(
            idata_emcee, theta_space, theta_names, X_fd,
            Y, X_test_fd, Y_test, folds, prefix="emcee",
            point_est=pe)

        df_metrics_emcee = df_metrics_emcee.append(df_metrics_var_sel)

    df_metrics_emcee = df_metrics_emcee.append(df_metrics_mle)
    if FIT_SK:
        df_metrics_emcee = df_metrics_emcee.append(df_metrics_sk)

    df_metrics_emcee.sort_values(
        results_columns[-1], inplace=True, ascending=False)

    print("-- Classification metrics --")
    display(df_metrics_emcee.style.hide_index())

    return sampler, idata_emcee, df_metrics_emcee
In [17]:
# -- Sampler parameters

n_walkers = 64
n_iter_initial = 100
n_iter = 1000
return_pp = True
return_ll = True
frac_random = 0.3

sd_beta_init = 1.0
sd_tau_init = 0.2
mean_alpha0_init = 0.0
sd_alpha0_init = 1.0
param_sigma2_init = 2.0  # shape parameter in inv_gamma distribution
sd_sigma2_init = 1.0

moves = [
    (emcee.moves.StretchMove(), 0.7),
    (emcee.moves.WalkMove(), 0.3)
]

thin = 1
thin_pp = 5

FAST_RUN = True
In [18]:
# -- Run sampler

# Start every walker in a (random) neighbourhood around the MLE
p0 = utils.weighted_initial_guess_around_value(
    theta_space, mle_theta_tr, sd_beta_init, sd_tau_init,
    mean_alpha0_init, sd_alpha0_init, param_sigma2_init,
    sd_sigma2_init, n_walkers=n_walkers, rng=rng,
    frac_random=frac_random)

b0 = mle_theta_tr[theta_space.beta_idx]

if FAST_RUN:
    sampler, idata_emcee, df_metrics_emcee_full = run_emcee(
        n_walkers, n_iter_initial, n_iter,
        moves, thin, thin_pp, return_pp, return_ll)
else:
    with Pool(N_CORES) as pool:
        print(
            f"-- Running affine-invariant ensemble sampler with {N_CORES} cores --")

        sampler = emcee.EnsembleSampler(
            n_walkers, theta_ndim, log_posterior,
            pool=pool, args=(Y,),
            moves=moves)

        print("Tuning phase...")
        state = sampler.run_mcmc(
            p0, n_iter_initial, progress='notebook',
            store=False)
        sampler.reset()

        print("MCMC sampling...")
        sampler.run_mcmc(state, n_iter, progress='notebook')

    print(
        f"Mean acceptance fraction: {100*np.mean(sampler.acceptance_fraction):.3f}%")
-- Running affine-invariant ensemble sampler with 4 cores --
Tuning phase...
MCMC sampling...
Mean acceptance fraction: 22.394%

-- Summary statistics --
mean sd hdi_3% hdi_97% mode median
β[0] -21.373 6.109 -31.487 -10.438 -22.730 -22.762
β[1] 8.279 2.786 1.561 12.505 9.098 9.000
β[2] 1.666 0.588 0.990 2.720 1.698 1.721
τ[0] 0.062 0.157 0.000 0.054 0.031 0.029
τ[1] 0.113 0.088 0.010 0.156 0.111 0.102
τ[2] 0.852 0.177 0.684 1.000 0.899 0.897
$\alpha_0$ -0.126 0.245 -0.589 0.344 -0.115 -0.124
$\sigma^2$ 153.903 850.794 0.000 193.494 19.467 0.564
Computing metrics...
 /home/antcc/MCD/TFM/bayesian-functional-regression/venv-bfr-py39/lib/python3.9/site-packages/sklearn/svm/_base.py:1206: ConvergenceWarning:Liblinear failed to converge, increase the number of iterations.
-- Classification metrics --
Estimator Features Accuracy
rmh+sk_svm_rbf 2 0.720
emcee_mode+sk_svm_rbf 3 0.720
emcee_median+sk_svm_rbf 3 0.720
fpca+sk_logistic 2 0.700
emcee_mean+sk_logistic 3 0.700
fpca+sk_svm_rbf 10 0.700
emcee_median+sk_logistic 3 0.700
pca+sk_svm_rbf 3 0.700
emcee_mean+sk_svm_rbf 3 0.700
fpca+sk_svm_lin 2 0.700
sk_flda_apls 2 0.680
manual_sel+sk_logistic 100 0.680
rkvs+sk_svm_rbf 3 0.680
pca+sk_svm_lin 2 0.680
sk_fknn K=5 0.680
pca+sk_logistic 2 0.680
manual_sel+sk_svm_rbf 5 0.680
sk_fnc 100 0.680
rmh+sk_svm_lin 2 0.680
emcee_mode+sk_svm_lin 3 0.680
sk_lda_fpca+ridge 2 0.660
mle 3 0.660
sk_lda_rmh+ridge 2 0.660
pls+sk_svm_rbf 2 0.660
emcee_posterior_vote 3 0.660
rkvs+sk_logistic 3 0.660
pls+sk_logistic 2 0.660
sk_lda_pca+ridge 2 0.660
rmh+sk_logistic 2 0.660
manual_sel+sk_svm_lin 100 0.660
rkvs+sk_svm_lin 4 0.660
sk_lda_lasso 2 0.640
sk_lda_manual+ridge 10 0.640
emcee_posterior_mean 3 0.640
emcee_median+sk_svm_lin 3 0.640
emcee_mean+sk_svm_lin 3 0.640
emcee_mode+sk_logistic 3 0.640
emcee_median 3 0.640
emcee_mode 3 0.640
emcee_mean 3 0.620
pls+sk_svm_lin 2 0.620
sk_flda_l2_basis 7 0.600
sk_flda_fpls1_basis 2 0.600
sk_lda_pls1 2 0.580
sk_lda_pls+ridge 2 0.580
sk_mdc 100 0.580

Analysis

We analyze the samples from all chains, discarding as burn-in a few integrated autocorrelation times' worth of samples. We could also thin the chains and keep only every $k$-th value (see the get_chain sketch below).
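
In emcee, this discarding and thinning can be expressed directly through get_chain (with burn and thin as computed in the next cell):

# Drop `burn` initial iterations, keep every `thin`-th draw
chain = sampler.get_chain(discard=burn, thin=thin)            # (n_kept, n_walkers, ndim)
flat = sampler.get_chain(discard=burn, thin=thin, flat=True)  # (n_kept*n_walkers, ndim)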

In [19]:
# -- Sampler statistics and trace (with burn-in and thinning)

logging.disable(sys.maxsize)  # Disable logger

# Analyze autocorrelation and set burn-in and thinning values
autocorr = sampler.get_autocorr_time(quiet=True)
max_autocorr = np.max(autocorr)
if (np.isfinite(autocorr)).all():
    burn = int(3*max_autocorr)
else:
    print("Some autocorrelation value is not finite")
    burn = 500

# Get trace of samples
trace_flat = utils.get_trace_emcee(sampler, theta_space, burn, thin, flat=True)

# Get InferenceData object
idata_emcee = utils.emcee_to_idata(
    sampler, theta_space, burn, thin,
    ["p_star", "y_star"] if return_pp else [],
    return_ll)

# Update and show autocorrelation
autocorr_thin = sampler.get_autocorr_time(discard=burn, thin=thin, quiet=True)

logging.disable(logging.NOTSET)  # Re-enable logger

pd.DataFrame(
    zip(theta_labels, autocorr_thin, len(trace_flat)/autocorr_thin),
    columns=["", "Autocorrelation times", "Effective i.i.d samples"]
).style.hide_index()
Out[19]:
Autocorrelation times Effective i.i.d. samples
$\beta_1$ 51.465 886.662
$\beta_2$ 60.461 754.739
$\beta_3$ 56.990 800.698
$\tau_1$ 54.561 836.343
$\tau_2$ 62.573 729.261
$\tau_3$ 61.955 736.530
$\alpha_0$ 52.866 863.164
$\sigma^2$ 69.474 656.823
In [20]:
utils.summary(idata_emcee, var_names=theta_names,
              kind="stats", labeller=theta_labeller)
Out[20]:
mean sd hdi_3% hdi_97% mode median
β[0] -21.373 6.109 -31.487 -10.438 -22.730 -22.762
β[1] 8.279 2.786 1.561 12.505 9.098 9.000
β[2] 1.666 0.588 0.990 2.720 1.698 1.721
τ[0] 0.062 0.157 0.000 0.054 0.031 0.029
τ[1] 0.113 0.088 0.010 0.156 0.111 0.102
τ[2] 0.852 0.177 0.684 1.000 0.899 0.897
$\alpha_0$ -0.126 0.245 -0.589 0.344 -0.115 -0.124
$\sigma^2$ 153.903 850.794 0.000 193.494 19.467 0.564
In [21]:
az.plot_trace(idata_emcee, labeller=theta_labeller,
              combined=True, var_names=theta_names)
print("Combined density and trace plot:")
Combined density and trace plot:
In [22]:
az.plot_posterior(idata_emcee, labeller=theta_labeller, point_estimate='mode',
                  grid=(NROWS(theta_ndim), NCOLS), textsize=20,
                  var_names=theta_names)
print("Marginal posterior distributions:")
Marginal posterior distributions:

In this case, since the outcome variable is binary, we plot the distribution of $T(Y^*)$ to visually compare it with $T(Y)$, where $T$ is the number of $1$s in the sample (the number of successes).

We also show a separation plot, in which the predicted probabilities $\hat p_i$ (suitably averaged for each chain and sample) are ordered in an ascending manner, and then a vertical line is drawn on each of them either in a dark color (if $y_i$ is 1) or a light color (if $y_i$ is 0). In a perfect model, all the dark lines would be on the rightmost part, effectively "separating" the samples.
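
For illustration, a hand-rolled version of such a plot (a sketch assuming `p_hat` holds the averaged predicted probabilities and `y` the observed labels; `az.plot_separation` below is the polished equivalent):

# Minimal separation plot; p_hat and y are assumed to be defined
order = np.argsort(p_hat)                       # sort by probability
colors = np.where(y[order] == 1, "C0", "0.85")  # dark bar if y_i = 1
fig, ax = plt.subplots(figsize=(10, 1))
ax.bar(np.arange(len(y)), 1.0, width=1.0, color=colors)
ax.plot(np.arange(len(y)), p_hat[order], color="black")  # sorted p_hat
ax.set_yticks([])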

In [23]:
# -- Generate and plot posterior predictive checks from X

if "posterior_predictive" not in idata_emcee:
    pp_p, pp_y = utils.generate_pp(
        idata_emcee, X, theta_names,
        rng=rng, kind='classification')
    utils.pp_to_idata([pp_p, pp_y], idata_emcee,
                      ["p_star", "y_star"], merge=True)
else:
    pp_p = idata_emcee.posterior_predictive['p_star'].to_numpy()
    pp_y = idata_emcee.posterior_predictive['y_star'].to_numpy().astype(int)

# Posterior predictive checks
fig, axs = plt.subplots(1, 2, figsize=(10, 4))
plt.suptitle(r"Posterior predictive checks on $X$")

n_success = pp_y.reshape(-1, len(Y)).sum(axis=1)
az.plot_dist(n_success, label=r"$T(Y^*)$", ax=axs[0])
axs[0].axvline(n_success.mean(), ls="--", color="orange",
               lw=2, label=r"$\overline{T(Y^*)}$")
axs[0].axvline(Y.sum(), ls="--", color="red",
               lw=2, label=r"T(Y)")
axs[0].set_title("T = No. of successes")
axs[0].legend()
axs[0].set_yticks([])
axs[0].tick_params(labelsize=8)

az.plot_bpv(idata_emcee, kind='t_stat', t_stat='mean', data_pairs={
            'y_obs': 'y_star'}, plot_mean=False, ax=axs[1], bpv=False)
axs[1].axvline(Y.mean(), ls="--", color="r",
               lw=2, label=r"$\bar Y$")
handles, labels = axs[1].get_legend_handles_labels()
handles.extend([Line2D([0], [0], label=r"Distribution of $\bar Y^*$")])
axs[1].legend(handles=handles)

# Separation plot
az.plot_separation(idata_emcee, y="y_obs", y_hat="p_star", y_hat_line=True,
                   figsize=(10, 1), legend=False)
plt.title("Separation plot", fontsize=12)

# Show Bayesian p-values
for name, stat in statistics:
    bpv = utils.bpv(pp_y, Y, stat)
    print(f"bpv [T={name}]: {bpv:.3f}")
bpv [T=min]: 1.000
bpv [T=max]: 1.000
bpv [T=median]: 0.560
bpv [T=mean]: 0.560
bpv [T=std]: 0.844
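
The Bayesian p-values above are of the form $\mathbb P(T(Y^*) \geq T(Y))$; a minimal sketch of such a computation (assuming `pp_y` has shape (chains, draws, n); `utils.bpv` is not necessarily implemented this way):

def bayesian_pvalue(pp_y, y_obs, stat):
    # Fraction of posterior predictive draws whose statistic is at
    # least as large as the observed one; stat is e.g. np.mean
    t_rep = stat(pp_y.reshape(-1, len(y_obs)), axis=1)
    return np.mean(t_rep >= stat(y_obs))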
In [24]:
az.plot_autocorr(idata_emcee, combined=True, var_names=theta_names,
                 grid=(NROWS(theta_ndim), NCOLS), labeller=theta_labeller)
print("Combined autocorrelation times:")
Combined autocorrelation times:

Out-of-sample predictions

In [25]:
# -- Generate and plot posterior predictive samples from X_test

pp_test_p, pp_test_y = utils.generate_pp(
    idata_emcee, X_test, theta_names,
    rng=rng, kind='classification')
idata_pp_test = utils.pp_to_idata(
    [pp_test_p, pp_test_y], idata_emcee, ["p_star", "y_star"], y_obs=Y_test)

# Posterior predictive checks
fig, axs = plt.subplots(1, 2, figsize=(10, 4))
plt.suptitle(r"Posterior predictive checks on $X_{test}$")

n_success_test = pp_test_y.reshape(-1, len(Y_test)).sum(axis=1)
az.plot_dist(n_success_test, label=r"$T(Y_{test}^*)$", ax=axs[0])
axs[0].axvline(n_success_test.mean(), ls="--", color="orange",
               lw=2, label=r"$\overline{T(Y_{test}^*)}$")
axs[0].axvline(Y_test.sum(), ls="--", color="red",
               lw=2, label=r"$T(Y_{test})$")
axs[0].set_title("T = No. of successes")
axs[0].legend()
axs[0].set_yticks([])
axs[0].tick_params(labelsize=8)

az.plot_bpv(idata_pp_test, kind='t_stat', t_stat='mean', data_pairs={
            'y_obs': 'y_star'}, plot_mean=False, ax=axs[1], bpv=False)
axs[1].axvline(Y_test.mean(), ls="--", color="r",
               lw=2, label=r"$\bar Y_{test}$")
handles, labels = axs[1].get_legend_handles_labels()
handles.extend([Line2D([0], [0], label=r"Distribution of $\bar Y_{test}^*$")])
axs[1].legend(handles=handles)

# Separation plot
az.plot_separation(idata_pp_test, y="y_obs", y_hat="p_star", y_hat_line=True,
                   figsize=(10, 1), legend=False)
plt.title("Separation plot", fontsize=12)

# Show Bayesian p-values
for name, stat in statistics:
    bpv = utils.bpv(pp_test_y, Y_test, stat)
    print(f"bpv [T={name}]: {bpv:.3f}")
Posterior predictive samples...
bpv [T=min]: 1.000
bpv [T=max]: 1.000
bpv [T=median]: 0.502
bpv [T=mean]: 0.502
bpv [T=std]: 1.000
In [26]:
# -- Compute metrics using several point estimates

df_metrics_emcee = pd.DataFrame(columns=results_columns)

# Posterior predictive point estimates: mean probability vs. majority vote
Y_hat_pp_mean = [utils.threshold(y)
                 for y in pp_test_p[:, ::thin_pp, :].mean(axis=(0, 1))]
metrics_pp_mean = utils.classification_metrics(Y_test, Y_hat_pp_mean)
Y_hat_pp_vote = [utils.threshold(y)
                 for y in pp_test_y[:, ::thin_pp, :].mean(axis=(0, 1))]
metrics_pp_vote = utils.classification_metrics(Y_test, Y_hat_pp_vote)
df_metrics_emcee.loc[0] = [
    "emcee_posterior_mean",
    p_hat,
    metrics_pp_mean["acc"]
]
df_metrics_emcee.loc[1] = [
    "emcee_posterior_vote",
    p_hat,
    metrics_pp_vote["acc"]
]

# Point estimates
for i, pe in enumerate(point_estimates):
    Y_hat_pe = utils.point_predict(
        X_test, idata_emcee,
        theta_names, pe, kind='classification')
    metrics_pe = utils.classification_metrics(Y_test, Y_hat_pe)
    df_metrics_emcee.loc[i + 2] = [
        "emcee_" + pe,
        p_hat,
        metrics_pe["acc"],
    ]

df_metrics_emcee.sort_values(
    results_columns[-1], inplace=True, ascending=False)
df_metrics_emcee.style.hide_index()
Out[26]:
Estimator Features Accuracy
emcee_posterior_mean 3 0.640
emcee_posterior_vote 3 0.640
emcee_mode 3 0.640
emcee_median 3 0.640
emcee_mean 3 0.620
In [27]:
# -- Test variable selection procedure

df_metrics_emcee_var_sel = pd.DataFrame(columns=results_columns)

for pe in point_estimates:
    df_var_sel = bayesian_var_sel(
        idata_emcee, theta_space, theta_names, X_fd,
        Y, X_test_fd, Y_test, folds, prefix="emcee",
        point_est=pe)

    df_metrics_emcee_var_sel = df_metrics_emcee_var_sel.append(df_var_sel)

df_metrics_emcee_var_sel.sort_values(
    results_columns[-1], inplace=True, ascending=False)
df_metrics_emcee_var_sel.style.hide_index()
 /home/antcc/MCD/TFM/bayesian-functional-regression/venv-bfr-py39/lib/python3.9/site-packages/sklearn/svm/_base.py:1206: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.
Out[27]:
Estimator Features Accuracy
emcee_mode+sk_svm_rbf 3 0.720
emcee_median+sk_svm_rbf 3 0.720
emcee_mean+sk_logistic 3 0.700
emcee_mean+sk_svm_rbf 3 0.700
emcee_median+sk_logistic 3 0.700
emcee_mode+sk_svm_lin 3 0.680
emcee_mode+sk_logistic 3 0.640
emcee_mean+sk_svm_lin 3 0.640
emcee_median+sk_svm_lin 3 0.640

Save & Load

This is only for testing purposes; in a production environment, one should use the backends feature of emcee, which stores the chain on disk incrementally as it is sampled.
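
For reference, a minimal sketch of that mechanism (the file name is illustrative, and `n_walkers` and `log_posterior` stand for the objects defined in the earlier sampling cells):

import emcee

# n_walkers and log_posterior are assumed defined earlier in the notebook
backend = emcee.backends.HDFBackend("emcee-p-fixed.h5")  # illustrative name
backend.reset(n_walkers, theta_ndim)  # clear any previous content
sampler_disk = emcee.EnsembleSampler(
    n_walkers, theta_ndim, log_posterior, backend=backend)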

In [ ]:
# -- Save

with open("emcee-p-fixed.idata", 'wb') as file:
    pickle.dump(idata_emcee, file)
In [ ]:
# -- Load

with open("emcee-p-fixed.idata", 'rb') as file:
    idata_emcee = pickle.load(file)
    trace = idata_emcee.posterior.to_array().to_numpy().T
    trace_flat = trace.reshape(-1, trace.shape[-1])  # All chains combined

The PyMC library

In [28]:
import pymc3 as pm
import theano
import theano.tensor as tt

Model

In [29]:
# -- Probabilistic model

def make_model(theta_space, g, eta, X, Y, names, names_aux, mle_theta=None):
    n, N = X.shape
    grid = theta_space.grid
    p = theta_space.p

    if mle_theta is not None:
        b0 = mle_theta[:p]
    else:
        b0 = g*rng.standard_normal(size=p)  # <-- Change if needed

    with pm.Model() as model:
        X_pm = pm.Data('X', X)

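        # Flat improper prior π(α0, log σ) ∝ 1 via a constant log-density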
        alpha0_and_log_sigma = pm.DensityDist(
            names_aux[0], lambda x: 0, shape=(2,))

        alpha0 = pm.Deterministic(names[-2], alpha0_and_log_sigma[0])

        log_sigma = alpha0_and_log_sigma[1]
        sigma = pm.math.exp(log_sigma)
        sigma2 = pm.Deterministic(names[-1], sigma**2)

        tau = pm.Uniform(names[1], 0.0, 1.0, shape=(p,))

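        # Map each τ_j to the index of the nearest point in the grid,
        # so the covariates enter the model only through x(τ_j)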
        idx = np.abs(grid - tau[:, np.newaxis]).argmin(1)
        X_tau = X_pm[:, idx]
        G_tau = pm.math.matrix_dot(X_tau.T, X_tau)
        G_tau = (G_tau + G_tau.T)/2.  # Enforce symmetry
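        # Add η·λ_max(G_τ)·I so that G_τ is well-conditioned, as in the prior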
        G_tau_reg = G_tau + eta * \
            tt.max(tt.nlinalg.eigh(G_tau)[0])*np.identity(p)

        def beta_lprior(x):
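            # Log-density (up to additive constants) of the Gaussian prior
            # β | τ, σ² ~ N(b0, g·σ²·G_reg⁻¹) described in the introduction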
            b = x - b0

            return (0.5*pm.math.logdet(G_tau_reg)
                    - p*log_sigma
                    - pm.math.matrix_dot(b.T, G_tau_reg, b)/(2.*g*sigma2))

        beta = pm.DensityDist(names[0], beta_lprior, shape=(p,))

        px = pm.Deterministic(
            'p_star',
            pm.math.invlogit(alpha0 + pm.math.matrix_dot(X_tau, beta)))

        y_obs = pm.Bernoulli('y_obs', p=px, observed=Y)

    return model

Experiments

In [30]:
# -- Hyperparameters

burn = 0
thin = 1
thin_pp = 5

n_samples_nuts = 1000
tune_nuts = 1200
target_accept = 0.81
n_samples_metropolis = 10000
tune_metropolis = 3000

USE_NUTS = False
In [31]:
# -- Run sampler

model = make_model(theta_space, g, eta, X, Y, theta_names,
                   theta_names_aux[:1], mle_theta_tr)

with model:
    if USE_NUTS:
        idata_pymc = pm.sample(n_samples_nuts, cores=2,
                               tune=tune_nuts, target_accept=target_accept,
                               return_inferencedata=True)
    else:
        step = pm.Metropolis()
        idata_pymc = pm.sample(n_samples_metropolis, cores=2,
                               tune=tune_metropolis, step=step,
                               return_inferencedata=True)

    idata_pymc = idata_pymc.sel(draw=slice(burn, None, thin))
Multiprocess sampling (2 chains in 2 jobs)
CompoundStep
>Metropolis: [β]
>Metropolis: [τ]
>Metropolis: [α0 and log σ]
100.00% [26000/26000 00:26<00:00 Sampling 2 chains, 0 divergences]
Sampling 2 chains for 3_000 tune and 10_000 draw iterations (6_000 + 20_000 draws total) took 26 seconds.
The rhat statistic is larger than 1.4 for some parameters. The sampler did not converge.
The estimated number of effective samples is smaller than 200 for some parameters.

Analysis

Since the tuning iterations already serve as burn-in, we keep the whole trace. In addition, we could consider thinning the samples.
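
For instance, keeping every 10th draw (the stride is arbitrary):

idata_pymc_thin = idata_pymc.sel(draw=slice(None, None, 10))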

In [32]:
utils.summary(idata_pymc, var_names=theta_names, labeller=theta_labeller)
Out[32]:
mean sd hdi_3% hdi_97% mcse_mean mcse_sd ess_bulk ess_tail r_hat mode median
β[0] -15.186 10.865 -28.207 4.182 6.153 4.838 4.0 33.0 1.56 -22.745 -21.324
β[1] 6.591 4.330 -1.804 13.921 1.090 0.786 18.0 30.0 1.23 9.074 8.206
β[2] -0.207 5.922 -14.446 10.094 2.480 1.852 11.0 29.0 1.44 1.711 1.724
τ[0] 0.126 0.208 0.000 0.584 0.072 0.053 9.0 72.0 1.26 0.028 0.031
τ[1] 0.438 0.355 0.040 0.956 0.184 0.142 5.0 144.0 1.30 0.102 0.317
τ[2] 0.666 0.321 0.031 0.993 0.095 0.077 10.0 33.0 1.15 0.918 0.808
$\alpha_0$ -0.094 0.257 -0.550 0.418 0.005 0.003 2983.0 4008.0 1.00 -0.065 -0.088
$\sigma^2$ 1111.885 9108.506 0.003 2655.636 545.403 386.062 4.0 32.0 1.42 332.859 136.566
In [33]:
az.plot_trace(idata_pymc, var_names=theta_names, labeller=theta_labeller)
print("Density and trace plot:")
Density and trace plot:
In [34]:
az.plot_posterior(
    idata_pymc, point_estimate='mode',
    var_names=theta_names,
    labeller=theta_labeller,
    textsize=20,
    grid=(NROWS(theta_ndim), NCOLS))
print("Marginal posterior distributions:")
Marginal posterior distributions:
In [35]:
# -- Generate and plot posterior predictive samples from X

with model:
    print("Generating posterior predictive samples...")
    pp_p, pp_y = utils.generate_pp(
        idata_pymc, X, theta_names, rng=rng, kind='classification')
    utils.pp_to_idata([pp_p, pp_y], idata_pymc,
                      ['p_star', 'y_star'], merge=True)

# Posterior predictive checks
fig, axs = plt.subplots(1, 2, figsize=(10, 4))
plt.suptitle(r"Posterior predictive checks on $X$")

n_success = pp_y.reshape(-1, len(Y)).sum(axis=1)
az.plot_dist(n_success, label=r"$T(Y^*)$", ax=axs[0])
axs[0].axvline(n_success.mean(), ls="--", color="orange",
               lw=2, label=r"$\overline{T(Y^*)}$")
axs[0].axvline(Y.sum(), ls="--", color="red",
               lw=2, label=r"$T(Y)$")
axs[0].set_title("T = No. of successes")
axs[0].legend()
axs[0].set_yticks([])
axs[0].tick_params(labelsize=8)

az.plot_bpv(idata_pymc, kind='t_stat', t_stat='mean',
            plot_mean=False, ax=axs[1], bpv=False,
            data_pairs={"y_obs": "y_star"})
axs[1].axvline(Y.mean(), ls="--", color="r",
               lw=2, label=r"$\bar Y$")
handles, labels = axs[1].get_legend_handles_labels()
handles.extend([Line2D([0], [0], label=r"Distribution of $\bar Y^*$")])
axs[1].legend(handles=handles)

# Separation plot
az.plot_separation(idata_pymc, y="y_obs", y_hat="p_star", y_hat_line=True,
                   figsize=(10, 1), legend=False)
plt.title("Separation plot", fontsize=12)

# Show Bayesian p-values
for name, stat in statistics:
    bpv = utils.bpv(pp_y, Y, stat)
    print(f"bpv [T={name}]: {bpv:.3f}")
Generating posterior predictive samples...
bpv [T=min]: 1.000
bpv [T=max]: 1.000
bpv [T=median]: 0.536
bpv [T=mean]: 0.536
bpv [T=std]: 0.853
In [36]:
az.plot_autocorr(idata_pymc, var_names=theta_names,
                 combined=True, grid=(NROWS(theta_ndim), NCOLS),
                 labeller=theta_labeller)
print("Combined autocorrelation times:")
Combined autocorrelation times:
In [39]:
print("Graphical model:")
pm.model_graph.model_to_graphviz(model)
Graphical model:
Out[39]:
[Graphviz rendering of the model: the data node X, the Uniform prior on τ and the flat DensityDist on (α0, log σ) feed the Deterministic nodes α0 and σ2 and the DensityDist prior on β; X, τ, α0 and β determine the Deterministic node p_star, which parameterizes y_obs ~ Bernoulli.]

Out-of-sample predictions

First we take a look at the distribution of predictions on a previously unseen dataset.

In [37]:
# -- Generate and plot posterior predictive samples from X_test

model_test = make_model(theta_space, g, eta, X_test, Y_test, theta_names,
                        theta_names_aux[:1], mle_theta)

with model_test:
    print("Generating posterior predictive on hold-out data...")
    pp_test_p, pp_test_y = utils.generate_pp(
        idata_pymc, X_test, theta_names, rng=rng, kind='classification')
    idata_pp_test = utils.pp_to_idata(
        [pp_test_p, pp_test_y], idata_pymc,
        ['p_star', 'y_star'], y_obs=Y_test)

# Posterior predictive checks
fig, axs = plt.subplots(1, 2, figsize=(10, 4))
plt.suptitle(r"Posterior predictive checks on $X_{test}$")

n_success_test = pp_test_y.reshape(-1, len(Y_test)).sum(axis=1)
az.plot_dist(n_success_test, label=r"$T(Y_{test}^*)$", ax=axs[0])
axs[0].axvline(n_success_test.mean(), ls="--", color="orange",
               lw=2, label=r"$\overline{T(Y_{test}^*)}$")
axs[0].axvline(Y_test.sum(), ls="--", color="red",
               lw=2, label=r"$T(Y_{test})$")
axs[0].set_title("T = No. of successes")
axs[0].legend()
axs[0].set_yticks([])
axs[0].tick_params(labelsize=8)

az.plot_bpv(idata_pp_test, kind='t_stat', t_stat='mean',
            plot_mean=False, ax=axs[1], bpv=False,
            data_pairs={"y_obs": "y_star"})
axs[1].axvline(Y_test.mean(), ls="--", color="r",
               lw=2, label=r"$\bar Y_{test}$")
handles, labels = axs[1].get_legend_handles_labels()
handles.extend([Line2D([0], [0], label=r"Distribution of $\bar Y_{test}^*$")])
axs[1].legend(handles=handles)

# Separation plot
az.plot_separation(idata_pp_test, y="y_obs", y_hat="p_star",
                   y_hat_line=True, figsize=(10, 1), legend=False)
plt.title("Separation plot", fontsize=12)

# Show Bayesian p-values
for name, stat in statistics:
    bpv = utils.bpv(pp_test_y, Y_test, stat)
    print(f"bpv [T={name}]: {bpv:.3f}")
Generating posterior predictive on hold-out data...
bpv [T=min]: 1.000
bpv [T=max]: 1.000
bpv [T=median]: 0.497
bpv [T=mean]: 0.497
bpv [T=std]: 1.000

Next we look at the accuracy obtained when using several point estimates of the parameters.
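
For reference, a minimal sketch of what such a point prediction amounts to once a point estimate `(beta, tau, alpha0)` has been extracted from the trace (`utils.point_predict` is not necessarily implemented this way):

def predict_point(X, grid, beta, tau, alpha0, threshold=0.5):
    # Plug a point estimate into the functional logistic model
    idx = np.abs(grid - tau[:, None]).argmin(axis=1)  # nearest grid points
    eta = alpha0 + X[:, idx] @ beta                   # linear predictor
    p = 1.0/(1.0 + np.exp(-eta))                      # inverse logit
    return (p >= threshold).astype(int)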

In [38]:
# -- Compute metrics using several point estimates

df_metrics_pymc = pd.DataFrame(columns=results_columns)

# Posterior predictive point estimates: mean probability vs. majority vote
Y_hat_pp_mean = [utils.threshold(y)
                 for y in pp_test_p[:, ::thin_pp, :].mean(axis=(0, 1))]
metrics_pp_mean = utils.classification_metrics(Y_test, Y_hat_pp_mean)
Y_hat_pp_vote = [utils.threshold(y)
                 for y in pp_test_y[:, ::thin_pp, :].mean(axis=(0, 1))]
metrics_pp_vote = utils.classification_metrics(Y_test, Y_hat_pp_vote)
df_metrics_pymc.loc[0] = [
    "pymc_posterior_mean",
    p_hat,
    metrics_pp_mean["acc"]
]
df_metrics_pymc.loc[1] = [
    "pymc_posterior_vote",
    p_hat,
    metrics_pp_vote["acc"]
]

# Point estimates
for i, pe in enumerate(point_estimates):
    Y_hat_pe = utils.point_predict(
        X_test, idata_pymc,
        theta_names, pe, kind='classification')
    metrics_pe = utils.classification_metrics(Y_test, Y_hat_pe)
    df_metrics_pymc.loc[i + 2] = [
        "pymc_" + pe,
        p_hat,
        metrics_pe["acc"],
    ]

df_metrics_pymc.sort_values(results_columns[-1], inplace=True, ascending=False)
df_metrics_pymc.style.hide_index()
Out[38]:
Estimator Features Accuracy
pymc_posterior_mean 3 0.660
pymc_posterior_vote 3 0.640
pymc_mode 3 0.640
pymc_mean 3 0.640
pymc_median 3 0.600
In [39]:
# -- Test variable selection procedure

df_metrics_pymc_var_sel = pd.DataFrame(columns=results_columns)

for pe in point_estimates:
    df_var_sel = bayesian_var_sel(
        idata_pymc, theta_space, theta_names, X_fd,
        Y, X_test_fd, Y_test, folds, prefix="pymc",
        point_est=pe)

    df_metrics_pymc_var_sel = df_metrics_pymc_var_sel.append(df_var_sel)

df_metrics_pymc_var_sel.sort_values(
    results_columns[-1], inplace=True, ascending=False)
df_metrics_pymc_var_sel.style.hide_index()
Out[39]:
Estimator Features Accuracy
pymc_mode+sk_svm_lin 3 0.680
pymc_mode+sk_svm_rbf 3 0.680
pymc_median+sk_svm_rbf 3 0.680
pymc_mean+sk_logistic 3 0.660
pymc_median+sk_logistic 3 0.660
pymc_mode+sk_logistic 3 0.640
pymc_mean+sk_svm_rbf 3 0.640
pymc_median+sk_svm_lin 3 0.640
pymc_mean+sk_svm_lin 3 0.600

Save & Load

In [ ]:
# -- Save

_ = idata_pymc.to_netcdf("pymc-p-fixed.nc")
In [ ]:
# -- Load

idata_pymc = az.from_netcdf("pymc-p-fixed.nc")

Notebook metadata

In [40]:
%load_ext watermark
%watermark -n -u -v -iv -w
Last updated: Thu Jan 20 2022

Python implementation: CPython
Python version       : 3.9.9
IPython version      : 7.30.1

autopep8  : 1.6.0
scipy     : 1.7.3
numpy     : 1.20.3
emcee     : 3.1.1
pandas    : 1.3.5
theano    : 1.1.2
pymc3     : 3.11.4
skfda     : 0.0
matplotlib: 3.5.1
sys       : 3.9.9 (main, Jan  3 2022, 14:06:06) 
[GCC 11.1.0]
arviz     : 0.11.4
logging   : 0.5.1.2
json      : 2.0.9

Watermark: 2.2.0

In [ ]: